This code is a showcase of the diverse natural language processing techniques one can use to analyze and classify free-text reviews.
Disclaimer: it has been used for a conference workshop and does not reflect best coding practices.
import os, sys
import codecs
import gensim
import pickle
import nltk
import string
import random
import sqlite3
import time
import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt
from IPython.display import Image
%matplotlib inline
'''Reproducibility settings'''
random.seed(100)  # fix Python's RNG so later sampling steps are repeatable
We start here by loading our dataset, stored in a SQLite database, and inspect it very quickly. We also load the config path where we will store any (hyper-)parameters we may want to use later in our code.
# Relative paths for the dataset and for saved models.
data_path = '../data'
model_path = '../model'
#config_path = '../config'
This dataset consists of reviews of fine foods from amazon. The data span a period of more than 10 years, including all ~500,000 reviews up to October 2012. Reviews include product and user information, ratings, and a plaintext review. We also have reviews from all other Amazon categories. As a reminder it can be found here on kaggle or here on SNAP (original source).
This dataset consists of a single table, Reviews.
# Open the SQLite dump and pull every review except the 3-star (neutral) ones.
con = sqlite3.connect(os.path.join(data_path, 'database.sqlite'))
cursor = con.cursor()
%time table = pd.read_sql_query("SELECT * FROM Reviews where Score <> 3;", con)
table.head()
The columns in the table are:
The columns in the table are: Id; ProductId - unique identifier for the product; UserId - unique identifier for the user; ProfileName; HelpfulnessNumerator - number of users who found the review helpful; HelpfulnessDenominator - number of users who indicated whether they found the review helpful; Score - rating between 1 and 5; Time - timestamp for the review; Summary - brief summary of the review; Text - text of the review. This is a perfect example of a dataset that a business might be able to gather by sending a survey to its customers. We have a Score that we can use to create sentiment categories (positive/neutral/negative) and we have a free-text field, Text, where customers can freely enter a review. Data will probably be messy and unbalanced (like usual), hence the perfect showcase here.
# Quick numeric summary and the score distribution (no 3s — filtered out above).
table.describe()
plt.hist(table.Score)
plt.title("Histogram of Scores.")
plt.show()
We can assume that a score of 4 or greater is Positive and that a score lower than 3 strictly is Negative. The rest will be Neutral
def assign_sentiment(score, label=False):
    """Map a review score to a sentiment.

    Scores above 3 are Positive, below 3 Negative, exactly 3 Neutral.
    Returns the sentiment name, or its numeric class id when label=True
    (Positive=1, Negative=2, Neutral=3).
    """
    if score > 3:
        name, class_id = "Positive", 1
    elif score < 3:
        name, class_id = "Negative", 2
    else:
        name, class_id = "Neutral", 3
    return class_id if label else name
# Derive the sentiment name and its numeric class id from the raw score.
table['Sentiment'] = table.Score.apply(assign_sentiment)
table['ClassLabel'] = table.Score.apply(assign_sentiment, label=True)
table.Sentiment.value_counts()
As predicted, the dataset is completely imbalanced. We will balance it later, probably by sub-sampling, but we can also make use of different sampling techniques such as SMOTE (see the imbalanced-learn package for more details and this paper on SMOTE).
# Reviews-per-product distribution, then keep only the columns we need.
plt.hist([table.ProductId.value_counts()], bins=np.arange(0,1000,50))
plt.show()
data = table[['ProductId', 'Score', 'Summary', 'Text', 'Sentiment', 'ClassLabel']]
data.head()
We can now decide to focus only on the relevant columns and dump the rest.
data.to_csv(os.path.join(data_path, 'data_raw.csv'))
Now that we have our dataset, we want to make sure our reviews only contain characters we want. We remove '#' characters but keep the words after the '#' sign because they might be relevant (e.g. #onasugardiet).
def sanitize_data(data_path, filename):
    """Copy <filename>.csv to <filename>_clean.csv, replacing undecodable bytes.

    Reading with errors='replace' swaps malformed UTF-8 sequences for the
    replacement character so downstream parsers never choke on raw bytes.
    Context managers close both handles even on error (the original left
    them open and carried a dead nested helper with unused parameters).
    """
    in_path = os.path.join(data_path, filename + '.csv')
    out_path = os.path.join(data_path, filename + '_clean.csv')
    with codecs.open(in_path, "r", encoding='utf-8', errors='replace') as input_file, \
            open(out_path, "w") as output_file:
        for line in input_file:
            output_file.write(line)
sanitize_data(data_path, 'data_raw')
We can now reload the data.
# Reload the sanitized CSV; pandas opens/closes the file itself when given a
# path (the original passed an open handle that was never closed).
data = pd.read_csv(
    os.path.join(data_path, 'data_raw_clean.csv'),
    index_col='Unnamed: 0',
    error_bad_lines=False,  # NOTE(review): removed in pandas 2.x — use on_bad_lines='skip' there
    engine='c')
data.head()
Like we often say, "garbage in, garbage out". This is very true in the data science field when it comes to using machine learning or any techniques to play with data. In order to clean the data, we will here using several techniques to come up with a cleaner text. A clean dataset will allow a model to learn meaningful features and not overfit on irrelevant noise.
Let's grab only 3 sentences to explain the pipeline.
# Grab a handful of reviews to walk through the cleaning pipeline step by step.
# NOTE(review): .loc slicing is inclusive, so 59:62 yields 4 rows, not 3 — confirm intent.
text_test = data.Text.loc[59:62].tolist()
text_test
import re
# compress characters
def compress_characters(text):
    """Collapse each run of a repeated punctuation mark into a single one."""
    punct_group = "([" + re.escape(string.punctuation) + "])"
    return re.sub(punct_group + r"\1+", r"\1", text)
# Step 1: collapse repeated punctuation.
pipeline_test = [compress_characters(x) for x in text_test]
pipeline_test
def standardize_text(text):
    """Lower-case text and strip URLs, @mentions, and stray characters.

    Fix: the original called str.replace() with regex patterns such as
    r"http\\S+", which str.replace matches literally — those rules never
    fired. re.sub() applies them as intended; the plain-string rules and
    the rule order are unchanged.
    """
    text = re.sub(r"http\S+", "", text)   # full URLs
    text = re.sub(r"http", "", text)      # leftover scheme fragments
    text = re.sub(r"@\S+", "", text)      # @mentions
    text = text.replace("\\", "")
    text = text.replace("/", "")
    # Anything outside this whitelist becomes a space.
    text = re.sub(r"[^A-Za-z0-9(),*!?@\'\`\"\_\n]", " ", text)
    text = text.replace("@", "at")
    return text.lower()
# Step 2: lower-case and strip unwanted characters.
pipeline_test = [standardize_text(x) for x in pipeline_test]
pipeline_test
# Make sure the punkt sentence tokenizer is available before using it.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
from nltk import sent_tokenize
# Step 3: split each review into sentences.
pipeline_test = [sent_tokenize(x) for x in pipeline_test]
pipeline_test
from nltk.tokenize import RegexpTokenizer
# tokenize sentences
def tokenize(comment):
    """Split a sentence into word tokens (runs of word characters).

    Returns the sentinel 'NC' (not computable) for non-string input,
    matching the original behaviour; the bare except is narrowed to
    TypeError so genuine bugs are no longer swallowed silently.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    try:
        return tokenizer.tokenize(comment)
    except TypeError:
        # e.g. NaN coming out of pandas — regex tokenization needs a str
        return 'NC'
# Step 4: tokenize every sentence into words.
pipeline_test = [[tokenize(x) for x in text] for text in pipeline_test]
_ = [print(x) for x in pipeline_test]
# Make sure the stopwords corpus is available before using it
# (same guard pattern as the punkt download above).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
def clean_tokens(token_list, stop_words):
    """Drop stopwords, strip punctuation characters, and drop empty leftovers.

    stop_words may be any iterable; it is turned into a set once so
    membership testing is O(1) per token (it was an O(n) list scan).
    The dead `filtered_tokens = token_list` assignment was removed.
    """
    stop_set = set(stop_words)
    kept = [word for word in token_list if word not in stop_set]
    stripped = [''.join(ch for ch in tok if ch not in string.punctuation) for tok in kept]
    return [tok for tok in stripped if tok]
# Step 5: drop stopwords and punctuation from every sentence.
pipeline_test = [[clean_tokens(x, stop_words) for x in text] for text in pipeline_test]
_ = [print(x) for x in pipeline_test]
enjoy and enjoyed won't be represented by the same word. However, at some point later in this notebook, we might want to consider them as having exactly the same meaning. For this purpose, we will lemmatize our tokens as follows:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
# lemmatize words
def lemmatize_tokens(token_list):
    """Lemmatize tokens as verbs, then reduce them to WordNet base forms.

    Tokens that wordnet.morphy cannot map to a known base form are dropped.
    """
    lemmatizer = WordNetLemmatizer()
    base_forms = (wordnet.morphy(lemmatizer.lemmatize(tok, 'v')) for tok in token_list)
    return [form for form in base_forms if form is not None]
# Step 6: reduce tokens to their base forms.
pipeline_test = [[lemmatize_tokens(x) for x in text] for text in pipeline_test]
_ = [print(x) for x in pipeline_test]
The token lists look nothing like the initial but we have now several degrees of processing that we can use and try for our tasks.
We introduce here the transform_data function that runs all the tranformations listed above. It then creates 3 new columns:
# transform_data runs every transformation above and adds three columns:
#   tokens       - clean tokens (one list per sentence)
#   tokens_clean - clean tokens without stopwords
#   tokens_lemme - clean lemmatized tokens without stopwords
def transform_data(data, text_field, stop_words):
    """Apply the full cleaning pipeline to data[text_field] in place and return data."""
    print('Compressing characters...')
    data[text_field] = data[text_field].apply(compress_characters)
    print('Standardizing text...')
    data[text_field] = data[text_field].apply(standardize_text)
    print('Splitting sentences...')
    data['list_sentences'] = data[text_field].apply(sent_tokenize)
    print('Tokenizing text...')
    data['tokens'] = data['list_sentences'].apply(
        lambda sentences: [tokenize(sentence) for sentence in sentences])
    print('Cleaning tokens...')
    data['tokens_clean'] = data['tokens'].apply(
        lambda sentences: [clean_tokens(tokens, stop_words)
                           for tokens in sentences if tokens not in ['NC']])
    print('Lemmatizing tokens...')
    data['tokens_lemme'] = data['tokens_clean'].apply(
        lambda sentences: [lemmatize_tokens(tokens) for tokens in sentences])
    return data
%time data_clean = transform_data(data, text_field='Text', stop_words=stop_words)
While it can be very useful to split tokens by sentences (especially for a classification task), we will simplify our tokens input by flattening our lists of tokens:
# Adds the tokens_flat, tokens_clean_flat and tokens_lemme_flat columns.
def flatten_tokens(df, syntaxes):
    """For each suffix in syntaxes, flatten the per-sentence token lists of
    df['tokens' + suffix] into one flat list stored in a new *_flat column.

    Fix: iterate the suffixes directly instead of `range(len(syntaxes))`.
    """
    for suffix in syntaxes:
        source_col = 'tokens' + suffix
        df[source_col + '_flat'] = df[source_col].apply(
            lambda sentences: [token for sentence in sentences for token in sentence])
    return df
# Flatten the per-sentence token lists for all three processing degrees.
syntaxes = ['', '_clean', '_lemme']
%time data_clean = flatten_tokens(data_clean, syntaxes)
data_clean.head()
# Convenience views per sentiment class.
pos_data = data_clean.query("Sentiment=='Positive'")
neg_data = data_clean.query("Sentiment=='Negative'")
Let's look at few statistics on our dataset like the total number of words (vocabulary), the distribution of sentence lengths, etc.
# Corpus-level statistics for each processing degree: every token, per-review
# token counts, and the deduplicated (sorted) vocabulary.
all_words, all_words_clean, all_words_lemme = [[word for tokens in data_clean['tokens' + syntax + '_flat'] for word in tokens] for syntax in syntaxes]
sentence_lengths, sentence_lengths_clean, sentence_lengths_lemme = [[len(tokens) for tokens in data_clean['tokens' + syntax + '_flat']] for syntax in syntaxes]
VOCAB, VOCAB_CLEAN, VOCAB_LEMME = [sorted(list(set(all_words))) for all_words in [all_words, all_words_clean, all_words_lemme]]
print("%s words total for %s, with a vocabulary size of %s" % (len(all_words), 'tokens', len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))
print("%s words total for %s, with a vocabulary size of %s" % (len(all_words_clean), 'clean tokens', len(VOCAB_CLEAN)))
print("Max sentence length is %s" % max(sentence_lengths_clean))
print("%s words total for %s, with a vocabulary size of %s" % (len(all_words_lemme), 'lemmatized tokens', len(VOCAB_LEMME)))
print("Max sentence length is %s" % max(sentence_lengths_lemme))
# Vocabulary shrinks as more processing is applied.
LABELS = ['tokens' + syntax for syntax in syntaxes]
plt.bar(range(3), [len(VOCAB), len(VOCAB_CLEAN), len(VOCAB_LEMME)])
plt.xticks(range(3), LABELS)
plt.ylabel('Vocabulary size')
plt.xlabel('')
plt.title('Vocabulary size by degree of processing.')
plt.show()
pd.Series(sentence_lengths).describe()
plt.hist(sentence_lengths, bins=np.arange(0, 500, 50))
plt.show()
# Length statistics split by sentiment class.
sentence_lengths_pos = [len(tokens) for tokens in pos_data['tokens_flat']]
sentence_lengths_neg = [len(tokens) for tokens in neg_data['tokens_flat']]
sentence_lengths_all = [sentence_lengths_pos, sentence_lengths_neg]
print([np.mean(sen) for sen in sentence_lengths_all])
print([np.median(sen) for sen in sentence_lengths_all])
def show_plots(data_pts, n_row=1, n_col=3, figsize=(15, 6), title_name='Histogram'):
    """Draw up to n_row*n_col histograms (bins of width 25 up to 500) side by side."""
    fig, ax = plt.subplots(n_row, n_col, figsize=figsize)
    for slot, series in enumerate(data_pts[:n_row * n_col], start=1):
        plt.subplot(n_row, n_col, slot)
        plt.hist(series, bins=np.arange(0, 500, 25))
        plt.title(title_name, fontsize=10)
    # NOTE(review): the export lost the source indentation; tight_layout is
    # assumed to run once after the loop — confirm against the original notebook.
    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
show_plots(sentence_lengths_all)
We want to capture the semantic meaning of words, meaning we need to understand that words like ‘good’ and ‘positive’ are closer than ‘apricot’ and ‘continent.’ The tool we will use to help us capture meaning is called Word2Vec.
Our dataset is a list of sentences, so in order for our algorithm to extract patterns from the data, we first need to find a way to represent it in a way that our algorithm can understand, i.e. as a list of numbers.
Using pre-trained words
Word2Vec is a technique to find continuous embeddings for words. It learns from reading massive amounts of text and memorizing which words tend to appear in similar contexts. After being trained on enough data, it generates a 300-dimension vector for each word in a vocabulary, with words of similar meaning being closer to each other.
# Skip-gram / word2vec illustrations, then load Google's pretrained 300-d vectors.
Image(filename='../img/skip_gram.png')
Image(filename='../img/w2v.png')
EMBEDDING_DIM=300
# NOTE(review): 'wor2vec_path' is a typo for word2vec_path; kept as-is since it is only used on the next line.
wor2vec_path = os.path.join(data_path, 'word2vec/GoogleNews-vectors-negative300.bin.gz')
word2vec = gensim.models.KeyedVectors.load_word2vec_format(wor2vec_path, binary=True)
def load_glove(glove_path):
    """Load a GloVe text file into a {word: [float, ...]} dict.

    Each line is: word v1 v2 ... vN. The handle is closed via `with`
    (the original leaked it) and UTF-8 is requested explicitly, since the
    published GloVe releases are UTF-8 encoded.
    """
    model = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            model[parts[0]] = [float(val) for val in parts[1:]]
    return model
%%time
# Load the 300-d GloVe vectors (6B-token release).
glove_path = os.path.join(data_path, 'glove/glove.6B.300d.txt')
glove = load_glove(glove_path)
from gensim.models.word2vec import Word2Vec
# train custom-made word2vec
def train_word2vec_model(n_dim, min_count, list_sentences, corpus_size, epochs):
    """Build the vocabulary and train an n_dim-dimensional gensim Word2Vec model."""
    model = Word2Vec(size=n_dim, min_count=min_count)
    model.build_vocab(list_sentences)
    model.train(list_sentences, total_examples=corpus_size, epochs=epochs)
    return model
# Train one custom word2vec model per processing degree.
cols = ['tokens' + syntax + '_flat' for syntax in syntaxes]
cols
# NOTE(review): corpus_size is given len(all_words) (total token count), but gensim's
# total_examples expects the number of sentences — confirm intent before reuse.
w2v_custom = [train_word2vec_model(n_dim=EMBEDDING_DIM, min_count=10, list_sentences=list(data_clean[col]), corpus_size=len(all_words), epochs=10) for col in cols]
embeddings_labels = ['w2v', 'amz', 'glv']
# 'amz' = custom model trained on our Amazon corpus; the simple dict keeps only
# the one trained on lemmatized tokens (index 2).
embedding_dict = {'w2v':word2vec, 'amz':w2v_custom, 'glv':glove}
embedding_dict_simple = {'w2v':word2vec, 'amz':w2v_custom[2], 'glv':glove}
def get_average_word2vec(tokens_list, w2v, generate_missing=False, k=300):
    """Average the k-dimensional embeddings of tokens_list.

    Out-of-vocabulary tokens contribute a random vector when
    generate_missing is True, otherwise a zero vector. An empty token
    list yields the zero vector.
    """
    if not tokens_list:
        return np.zeros(k)
    if generate_missing:
        vectors = [w2v[tok] if tok in w2v else np.random.rand(k) for tok in tokens_list]
    else:
        vectors = [w2v[tok] if tok in w2v else np.zeros(k) for tok in tokens_list]
    return np.sum(vectors, axis=0) / len(vectors)
def average_vectors(vectors, k=300):
    """Element-wise mean of a list of equal-length vectors.

    Returns np.zeros(k) for an empty list, or when the vectors cannot be
    summed (e.g. ragged lengths). The bare except is narrowed to the
    errors numpy actually raises in that case.
    """
    if not vectors:
        return np.zeros(k)
    try:
        return np.sum(vectors, axis=0) / len(vectors)
    except (ValueError, TypeError):
        # ragged or non-numeric input: fall back to the zero vector
        return np.zeros(k)
def get_word2vec_embeddings(embedding_representation, list_sentences, flat=False, generate_missing=False):
    """Embed each sentence (token list) by averaging its word vectors.

    With flat=False the per-sentence vectors are averaged again into one
    document vector; with flat=True the list of sentence vectors is
    returned as-is.
    """
    sentence_vectors = [
        get_average_word2vec(tokens, embedding_representation, generate_missing=generate_missing)
        for tokens in list_sentences
    ]
    return sentence_vectors if flat else average_vectors(sentence_vectors)
def add_embedding(df, embedding_representation, output_prefix, flat=False, generate_missing=False):
    """Add document embeddings for the tokens, tokens_clean and tokens_lemme columns.

    New columns are named output_prefix, output_prefix + '_clean' and
    output_prefix + '_lemme'.
    """
    for suffix in ('', '_clean', '_lemme'):
        df[output_prefix + suffix] = df['tokens' + suffix].apply(
            lambda sentences: get_word2vec_embeddings(
                embedding_representation, sentences, flat, generate_missing))
    return df
def add_data_embeddings(df, embedding_dict, flat=False, generate_missing=False):
    """Add one set of embedding columns per entry of embedding_dict.

    Fix: flat and generate_missing are now forwarded to add_embedding —
    the original accepted them but silently dropped them, so callers could
    never change those behaviours.
    """
    for prefix, representation in embedding_dict.items():
        print(' -- Adding ' + prefix + ' embedding -- ')
        df = add_embedding(df, representation, prefix, flat, generate_missing)
    return df
# Add one averaged-embedding column set per representation.
data_clean = add_data_embeddings(data_clean, embedding_dict_simple)
data_clean.head()
embeddings_w2v, embeddings_amz, embeddings_glv = [data_clean[col + '_clean'].tolist() for col in embeddings_labels]
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
import matplotlib.patches as mpatches
%matplotlib inline
from pylab import *
def plot_LSA(test_data, test_labels, savepath="PCA_demo.csv", plot=True):
    """Project embeddings to 2-D with truncated SVD (LSA) and scatter-plot them.

    Fix: colour points with the computed color_column (labels remapped to
    0..n-1) — it was built but never used, and passing raw labels skews
    the colormap when labels do not start at 0. savepath is kept for
    interface compatibility (unused).
    """
    lsa = TruncatedSVD(n_components=2)
    lsa.fit(test_data)
    lsa_scores = lsa.transform(test_data)
    color_mapper = {label: idx for idx, label in enumerate(set(test_labels))}
    print(color_mapper)
    color_column = [color_mapper[label] for label in test_labels]
    colors = ['green','blue', 'red']
    if plot:
        plt.scatter(lsa_scores[:,0], lsa_scores[:,1], s=8, alpha=.8,
                    c=color_column, cmap=matplotlib.colors.ListedColormap(colors))
        green_patch = mpatches.Patch(color='green', label='Positive')
        red_patch = mpatches.Patch(color='red', label='Negative')
        plt.legend(handles=[green_patch, red_patch])
# 2-D LSA projection of the custom embeddings, coloured by class label.
x=embeddings_amz
y=data_clean.ClassLabel.tolist()
data_clean.Sentiment.value_counts()
fig = plt.figure(figsize=(10, 10))
plot_LSA(x, y)
plt.show()
output_label = 'Sentiment'
def balance_dataset(df, output_label, sampling_type):
    """Balance df so every value of output_label has the same number of rows.

    sampling_type 'over' resamples every class (with replacement) up to the
    largest class size; 'down' samples every class down to the smallest.
    Raises ValueError for any other value — the original fell through and
    crashed with a NameError instead.
    """
    classes = list(set(df[output_label].tolist()))
    subsets = [df.loc[df[output_label] == cls] for cls in classes]
    sizes = [subset.shape[0] for subset in subsets]
    if sampling_type == "over":
        target = max(sizes)
        result_df = pd.concat([subset.sample(target, replace=True) for subset in subsets])
    elif sampling_type == "down":
        target = min(sizes)
        result_df = pd.concat([subset.sample(target) for subset in subsets])
    else:
        raise ValueError("sampling_type must be 'over' or 'down', got %r" % sampling_type)
    result_df.reset_index(drop=True, inplace=True)
    return result_df
# Down-sample to the smallest class, then drop the Neutral rows entirely.
balanced_data_clean = balance_dataset(data_clean, output_label, sampling_type="down")
print('Balanced dataset shape :', balanced_data_clean.shape)
balanced_data_clean.Sentiment.value_counts()
balanced_data_clean = balanced_data_clean.loc[balanced_data_clean.Sentiment!='Neutral']
balanced_data_clean.reset_index(drop=True, inplace=True)
balanced_data_clean.Sentiment.value_counts()
embeddings_w2v_blcd, embeddings_amz_blcd, embeddings_glv_blcd = [balanced_data_clean[col + '_clean'].tolist() for col in embeddings_labels]
x=embeddings_amz_blcd
y=balanced_data_clean.ClassLabel.tolist()
# Sanity-check the custom model's nearest neighbours.
w2v_custom[2].most_similar('car')
# Feature vectors (X_w2v, X_amz, X_glv) and labels (y) for the classifiers,
# built from the lemmatized-token embeddings of the balanced dataset.
X_w2v, X_amz, X_glv = [balanced_data_clean[col + '_lemme'].tolist() for col in embeddings_labels]
y = balanced_data_clean.ClassLabel.tolist()
from sklearn.model_selection import train_test_split
def generate_data(X, y):
    """80/20 train/test split with a fixed seed for reproducibility."""
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=40)
    return X_tr, X_te, y_tr, y_te
# Same seed and same y everywhere, so all three splits line up row-for-row.
X_w2v_train, X_w2v_test, y_train, y_test = generate_data(X_w2v, y)
X_amz_train, X_amz_test, _, _ = generate_data(X_amz, y)
X_glv_train, X_glv_test, _, _ = generate_data(X_glv, y)
from sklearn.linear_model import LogisticRegression
# One logistic-regression classifier, refit in turn on each embedding type;
# note clf ends up holding the amz fit (used again by the LIME pipeline below).
clf = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg',
                         multi_class='multinomial', n_jobs=-1, random_state=40)
% time clf.fit(X_w2v_train, y_train)
y_w2v_predicted = clf.predict(X_w2v_test)
% time clf.fit(X_glv_train, y_train)
y_glv_predicted = clf.predict(X_glv_test)
% time clf.fit(X_amz_train, y_train)
y_amz_predicted = clf.predict(X_amz_test)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
def get_metrics(y_test, y_predicted):
    """Return (accuracy, precision, recall, f1); the last three are weighted averages."""
    # precision: TP / (TP + FP)
    precision = precision_score(y_test, y_predicted, pos_label=None, average='weighted')
    # recall: TP / (TP + FN)
    recall = recall_score(y_test, y_predicted, pos_label=None, average='weighted')
    # f1: harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    # accuracy: (TP + TN) / total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1
# Compare the three embedding types on the held-out split.
accuracy, precision, recall, f1 = get_metrics(y_test, y_w2v_predicted)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
accuracy, precision, recall, f1 = get_metrics(y_test, y_amz_predicted)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
accuracy, precision, recall, f1 = get_metrics(y_test, y_glv_predicted)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
'''
from sklearn import svm
clf = svm.SVC(kernel='linear', probability=True, random_state=40)
% time clf.fit(X_w2v_train, y_train)
y_w2v_predicted = clf.predict(X_w2v_test)
% time clf.fit(X_amz_train, y_train)
y_amz_predicted = clf.predict(X_amz_test)
% time clf.fit(X_glv_train, y_train)
y_glv_predicted = clf.predict(X_glv_test)
'''
#accuracy, precision, recall, f1 = get_metrics(y_test, y_w2v_predicted)
#print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
#accuracy, precision, recall, f1 = get_metrics(y_test, y_amz_predicted)
#print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
#accuracy, precision, recall, f1 = get_metrics(y_test, y_glv_predicted)
#print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
from sklearn.metrics import confusion_matrix
import itertools
# Confusion matrices (rows: true class, columns: predicted class).
cm_w2v = confusion_matrix(y_test, y_w2v_predicted)
cm_amz = confusion_matrix(y_test, y_amz_predicted)
cm_glv = confusion_matrix(y_test, y_glv_predicted)
cm_w2v
cm_amz
cm_glv
balanced_data_clean.ClassLabel.value_counts()
from lime import lime_text
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer
# Hold out raw review texts for LIME; same seed (40) as the vector splits.
X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(balanced_data_clean.Text,
                                                                        balanced_data_clean.ClassLabel,
                                                                        test_size=0.2,
                                                                        random_state=40)
# Embedding store consumed by the LIME prediction pipeline below.
vector_store = embedding_dict_simple['amz']
def word2vec_pipeline(examples):
    """LIME adapter: vectorize raw strings and return clf class probabilities."""
    global vector_store
    tokenizer = RegexpTokenizer(r'\w+')
    vectorized = [
        get_average_word2vec(tokenizer.tokenize(example), vector_store,
                             generate_missing=False, k=300)
        for example in examples
    ]
    return clf.predict_proba(vectorized)
#c = make_pipeline(count_vectorizer, clf)
def explain_one_instance(instance, class_names):
    """Explain one prediction with LIME, keeping the 6 strongest features."""
    explainer = LimeTextExplainer(class_names=class_names)
    return explainer.explain_instance(instance, word2vec_pipeline, num_features=6)
def visualize_one_exp(features, labels, index, class_names = ["Positive","Negative"]):
    """Show the LIME explanation for one test example in the notebook.

    labels holds class ids 1/2, hence the -1 when indexing class_names.
    """
    explanation = explain_one_instance(features[index], class_names=class_names)
    print('Index: %d' % index)
    print('True class: %s' % class_names[labels[index] - 1])
    explanation.show_in_notebook(text=True)
# Explain a few test examples (FutureWarnings from LIME/sklearn silenced).
indexes = X_test_data.index
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
visualize_one_exp(X_test_data, y_test_data, indexes[0])
visualize_one_exp(X_test_data, y_test_data, indexes[3])
visualize_one_exp(X_test_data, y_test_data, indexes[5])
We’ve covered quick and efficient approaches to generate compact sentence embeddings. However, by omitting the order of words, we are discarding all of the syntactic information of our sentences. If these methods do not provide sufficient results, you can utilize more complex model that take in whole sentences as input and predict labels without the need to build an intermediate representation. A common way to do that is to treat a sentence as a sequence of individual word vectors using either Word2Vec or more recent approaches such as GloVe or CoVe. This is what we will do below.
Image(filename='../img/cnn_vision.png')
Image(filename='../img/cnn_text.png')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# Hyper-parameters for the CNN text classifier.
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 35
VOCAB_SIZE = len(VOCAB)
VALIDATION_SPLIT=.2
# Integer-encode the texts and pad/truncate them to a fixed length.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(balanced_data_clean["Text"].tolist())
sequences = tokenizer.texts_to_sequences(balanced_data_clean["Text"].tolist())
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
cnn_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# Remap class ids 1/2 to 0/1 and one-hot encode them.
balanced_data_clean["NewClassLabel"] = balanced_data_clean["ClassLabel"].map({1:0, 2:1})
labels = to_categorical(np.asarray(balanced_data_clean["NewClassLabel"]))
# Shuffle once before carving out the validation tail.
indices = np.arange(cnn_data.shape[0])
np.random.shuffle(indices)
cnn_data = cnn_data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * cnn_data.shape[0])
# Embedding matrix: row i holds the vector of the word with index i;
# out-of-vocabulary words get a random vector.
embedding_weights = np.zeros((len(word_index)+1, EMBEDDING_DIM))
for word,index in word_index.items():
    embedding_weights[index,:] = embedding_dict_simple['amz'][word] if word in embedding_dict_simple['amz'] else np.random.rand(EMBEDDING_DIM)
print(embedding_weights.shape)
balanced_data_clean["ClassLabel"].value_counts()
balanced_data_clean["NewClassLabel"].value_counts()
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index, trainable=False, extra_conv=True):
    """Build a Yoon Kim-style CNN text classifier over a (frozen) embedding layer.

    embeddings: pre-computed (num_words, embedding_dim) weight matrix.
    labels_index: number of output classes. With extra_conv=True the
    three-filter-size branch is used; otherwise a single conv branch.
    """
    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=trainable)
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    # Yoon Kim multi-filter branch (https://arxiv.org/abs/1408.5882)
    convs = []
    for filter_size in [3, 4, 5]:
        branch = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
        branch = MaxPooling1D(pool_size=3)(branch)
        convs.append(Flatten()(branch))
    l_merge = Concatenate()(convs)
    # Alternative single-conv branch with max pooling
    conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
    pool = MaxPooling1D(pool_size=3)(conv)
    if extra_conv == True:
        x = Dropout(0.5)(l_merge)
    else:
        # NOTE(review): this path feeds a 3-D tensor into Dense; a Flatten
        # was commented out in the original — confirm before using extra_conv=False.
        x = Dropout(0.5)(pool)
    x = Dense(128, activation='relu')(x)
    preds = Dense(labels_index, activation='sigmoid')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    return model
# Split the shuffled data into train and validation sets.
x_train = cnn_data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = cnn_data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
# labels_index = number of classes (2 after dropping Neutral); 6th arg sets trainable=False.
model = ConvNet(embedding_weights, MAX_SEQUENCE_LENGTH, len(word_index)+1, EMBEDDING_DIM,
                len(list(balanced_data_clean["ClassLabel"].unique())), False)
#model_hist = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=128)
fastText is a library for efficient learning of word representations and sentence classification developed by Facebook Research
balanced_data_clean.head(2)
def add_fasttext_columns(df, shuffle=False, label_prefix='__label__'):
    """Add the label/text columns fastText's text format expects.

    Sentiment_ft is '__label__<Sentiment>' (whitespace squeezed out of the
    label) plus a trailing space; Comment_ft is the flattened tokens joined
    by spaces, padded with one space on each side.

    Fix: the shuffled frame is now assigned back — the original computed
    df.sample(frac=1).reset_index(drop=True) and threw the result away,
    so shuffle=True silently did nothing.
    """
    if shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df['Sentiment_ft'] = label_prefix + df['Sentiment'].apply(lambda s : "".join(s.split())) + ' '
    df['Comment_clean'] = df['tokens_flat'].apply(lambda x : ' '.join(x))
    df['Comment_ft'] = ' ' + df['Comment_clean'] + ' '
    return df
# Build the fastText-formatted columns, then split train/test (same seed 40).
balanced_data_clean = add_fasttext_columns(balanced_data_clean)
balanced_data_clean[['Comment_ft', 'Sentiment_ft']].head()
data_labels = ['train', 'test']
X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(balanced_data_clean,
                                                                        balanced_data_clean.ClassLabel,
                                                                        test_size=0.2,
                                                                        random_state=40)
import fasttext
def fit_fasttext():
    """Dump the train/test splits in fastText format, then train and score a classifier."""
    data_file_ft = [os.path.join(data_path, '_data_ft.' + data_label) for data_label in data_labels]
    train_file_ft, test_file_ft = data_file_ft
    for split, path in zip([X_train_data, X_test_data], data_file_ft):
        split.to_csv(path, header=None, index=False, columns=['Sentiment_ft','Comment_ft'])
    model_file = os.path.join(model_path, 'fasttext_model')
    classifier = fasttext.supervised(train_file_ft, model_file, dim=30, lr=0.1, epoch=15,
                                     min_count=1, word_ngrams=3, bucket=10000000,
                                     thread=12, label_prefix='__label__')
    result = classifier.test(test_file_ft)
    # F1 from the aggregate precision/recall fastText reports
    f1_score = 2*(result.precision*result.recall)/(result.precision + result.recall)
    print('fasttext F1-score :', f1_score)
fit_fasttext()
Date: As of first iteration
Image(filename='../img/model_comparison.png')